In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt#visualization
import seaborn as sns#visualizations
import plotly.express as px#visualizations
import plotly.graph_objects as go#visualizations
from plotly.subplots import make_subplots#visualizations
import datetime as dt
from datetime import timedelta
In [2]:
from sklearn.model_selection import GridSearchCV#Hyper-parameter Tuning
from sklearn.preprocessing import StandardScaler#scaling
from sklearn.cluster import KMeans#clustering
from sklearn.metrics import silhouette_score,silhouette_samples
from sklearn.linear_model import LinearRegression,Ridge,Lasso#linear regression and regularization
from sklearn.svm import SVR#Support vector machine
from sklearn.metrics import mean_squared_error,r2_score#model evaluation metrics
import statsmodels.api as sm
from statsmodels.tsa.api import Holt,SimpleExpSmoothing,ExponentialSmoothing
from sklearn.preprocessing import PolynomialFeatures#Polynomial Regression
from statsmodels.tsa.stattools import adfuller#Augmented Dickey Fuller Test
In [3]:
covid=pd.read_csv("covid_19_data.csv")

Exploratory Data Analysis

In [4]:
covid.head()
Out[4]:
SNo ObservationDate Province/State Country/Region Last Update Confirmed Deaths Recovered
0 1 01/22/2020 Anhui Mainland China 1/22/2020 17:00 1.0 0.0 0.0
1 2 01/22/2020 Beijing Mainland China 1/22/2020 17:00 14.0 0.0 0.0
2 3 01/22/2020 Chongqing Mainland China 1/22/2020 17:00 6.0 0.0 0.0
3 4 01/22/2020 Fujian Mainland China 1/22/2020 17:00 1.0 0.0 0.0
4 5 01/22/2020 Gansu Mainland China 1/22/2020 17:00 0.0 0.0 0.0
In [5]:
print("Size/Shape of the dataset: ",covid.shape)
Size/Shape of the dataset:  (116805, 8)
In [6]:
print("Checking for null values:\n",covid.isnull().sum())
Checking for null values:
 SNo                    0
ObservationDate        0
Province/State     35353
Country/Region         0
Last Update            0
Confirmed              0
Deaths                 0
Recovered              0
dtype: int64
In [7]:
print("Checking Data-type of each column:\n",covid.dtypes)
Checking Data-type of each column:
 SNo                  int64
ObservationDate     object
Province/State      object
Country/Region      object
Last Update         object
Confirmed          float64
Deaths             float64
Recovered          float64
dtype: object
In [8]:
covid.drop(["SNo"],1,inplace=True)
In [9]:
#Converting "Observation Date" into Datetime format
covid["ObservationDate"]=pd.to_datetime(covid["ObservationDate"])
In [10]:
#performing aggregation based on sum of confirmed,recovered and active cases per country
grouped_country=covid.groupby(["Country/Region","ObservationDate"]).agg({"Confirmed":'sum',"Recovered":'sum',"Deaths":'sum'})
In [11]:
grouped_country["Active Cases"]=grouped_country["Confirmed"]-grouped_country["Recovered"]-grouped_country["Deaths"]
grouped_country["log_confirmed"]=np.log(grouped_country["Confirmed"])
grouped_country["log_active"]=np.log(grouped_country["Active Cases"])
In [12]:
#Grouping different types of cases as per the date
datewise=covid.groupby(["ObservationDate"]).agg({"Confirmed":'sum',"Recovered":'sum',"Deaths":'sum'})
#creating a columns of number of days from the start date
datewise["Days Since"]=datewise.index-datewise.index.min()

Some Basic Information

In [13]:
print("Totol number of countries with Disease Spread: ",len(covid["Country/Region"].unique()))
print("Total number of Confirmed Cases around the World: ",datewise["Confirmed"].iloc[-1])
print("Total number of Recovered Cases around the World: ",datewise["Recovered"].iloc[-1])
print("Total number of Deaths Cases around the World: ",datewise["Deaths"].iloc[-1])
print("Total number of Active Cases around the World: ",(datewise["Confirmed"].iloc[-1]-datewise["Recovered"].iloc[-1]-datewise["Deaths"].iloc[-1]))
print("Total number of Closed Cases around the World: ",datewise["Recovered"].iloc[-1]+datewise["Deaths"].iloc[-1])
print("Approximate number of Confirmed Cases per Day around the World: ",np.round(datewise["Confirmed"].iloc[-1]/datewise.shape[0]))
print("Approximate number of Recovered Cases per Day around the World: ",np.round(datewise["Recovered"].iloc[-1]/datewise.shape[0]))
print("Approximate number of Death Cases per Day around the World: ",np.round(datewise["Deaths"].iloc[-1]/datewise.shape[0]))
print("Approximate number of Confirmed Cases per hour around the World: ",np.round(datewise["Confirmed"].iloc[-1]/((datewise.shape[0])*24)))
print("Approximate number of Recovered Cases per hour around the World: ",np.round(datewise["Recovered"].iloc[-1]/((datewise.shape[0])*24)))
print("Approximate number of Death Cases per hour around the World: ",np.round(datewise["Deaths"].iloc[-1]/((datewise.shape[0])*24)))
print("Number of Confirmed Cases in last 24 hours: ",datewise["Confirmed"].iloc[-1]-datewise["Confirmed"].iloc[-2])
print("Number of Recovered Cases in last 24 hours: ",datewise["Recovered"].iloc[-1]-datewise["Recovered"].iloc[-2])
print("Number of Death Cases in last 24 hours: ",datewise["Deaths"].iloc[-1]-datewise["Deaths"].iloc[-2])
Totol number of countries with Disease Spread:  223
Total number of Confirmed Cases around the World:  31779835.0
Total number of Recovered Cases around the World:  21890442.0
Total number of Deaths Cases around the World:  975104.0
Total number of Active Cases around the World:  8914289.0
Total number of Closed Cases around the World:  22865546.0
Approximate number of Confirmed Cases per Day around the World:  129186.0
Approximate number of Recovered Cases per Day around the World:  88986.0
Approximate number of Death Cases per Day around the World:  3964.0
Approximate number of Confirmed Cases per hour around the World:  5383.0
Approximate number of Recovered Cases per hour around the World:  3708.0
Approximate number of Death Cases per hour around the World:  165.0
Number of Confirmed Cases in last 24 hours:  262748.0
Number of Recovered Cases in last 24 hours:  266008.0
Number of Death Cases in last 24 hours:  5526.0

ACTIVE CASES

In [14]:
#Active Cases = Number of Confirmed Cases - Number of Recovered Cases - Number of Death Cases
fig=px.bar(x=datewise.index,y=datewise["Confirmed"]-datewise["Recovered"]-datewise["Deaths"])
fig.update_layout(title="Distribution of Number of Active Cases",
                  xaxis_title="Date",yaxis_title="Number of Cases",)
fig.show()

CLOSED CASES

In [15]:
#Closed Cases = Number of Recovered Cases + Number of Death Cases
fig=px.bar(x=datewise.index,y=datewise["Recovered"]+datewise["Deaths"])
fig.update_layout(title="Distribution of Number of Closed Cases",
                  xaxis_title="Date",yaxis_title="Number of Cases")
fig.show()
In [16]:
#WEEKLY GROWTH 
datewise["WeekOfYear"]=datewise.index.weekofyear

week_num=[]
weekwise_confirmed=[]
weekwise_recovered=[]
weekwise_deaths=[]
w=1
for i in list(datewise["WeekOfYear"].unique()):
    weekwise_confirmed.append(datewise[datewise["WeekOfYear"]==i]["Confirmed"].iloc[-1])
    weekwise_recovered.append(datewise[datewise["WeekOfYear"]==i]["Recovered"].iloc[-1])
    weekwise_deaths.append(datewise[datewise["WeekOfYear"]==i]["Deaths"].iloc[-1])
    week_num.append(w)
    w=w+1

fig=go.Figure()
fig.add_trace(go.Scatter(x=week_num, y=weekwise_confirmed,
                    mode='lines+markers',
                    name='Weekly Growth of Confirmed Cases'))
fig.add_trace(go.Scatter(x=week_num, y=weekwise_recovered,
                    mode='lines+markers',
                    name='Weekly Growth of Recovered Cases'))
fig.add_trace(go.Scatter(x=week_num, y=weekwise_deaths,
                    mode='lines+markers',
                    name='Weekly Growth of Death Cases'))
fig.update_layout(title="Weekly Growth of different types of Cases",
                 xaxis_title="Week Number",yaxis_title="Number of Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()
In [17]:
#Growth rate of Confirmed, Recovered and Death Cases
fig=go.Figure()
fig.add_trace(go.Scatter(x=datewise.index, y=datewise["Confirmed"],
                    mode='lines+markers',
                    name='Confirmed Cases'))
fig.add_trace(go.Scatter(x=datewise.index, y=datewise["Recovered"],
                    mode='lines+markers',
                    name='Recovered Cases'))
fig.add_trace(go.Scatter(x=datewise.index, y=datewise["Deaths"],
                    mode='lines+markers',
                    name='Death Cases'))
fig.update_layout(title="Growth of different types of cases",
                 xaxis_title="Date",yaxis_title="Number of Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()
In [18]:
#MORTALITY AND RECOVERY RATE ANALYSIS OVER THE WORLD

#Mortality rate = (Number of Death Cases / Number of Confirmed Cases) x 100
#Recovery Rate= (Number of Recoverd Cases / Number of Confirmed Cases) x 100

datewise["Mortality Rate"]=(datewise["Deaths"]/datewise["Confirmed"])*100
datewise["Recovery Rate"]=(datewise["Recovered"]/datewise["Confirmed"])*100
datewise["Active Cases"]=datewise["Confirmed"]-datewise["Recovered"]-datewise["Deaths"]
datewise["Closed Cases"]=datewise["Recovered"]+datewise["Deaths"]

print("Average Mortality Rate",datewise["Mortality Rate"].mean())
print("Median Mortality Rate",datewise["Mortality Rate"].median())
print("Average Recovery Rate",datewise["Recovery Rate"].mean())
print("Median Recovery Rate",datewise["Recovery Rate"].median())

#Plotting Mortality and Recovery Rate 
fig = make_subplots(rows=2, cols=1,
                   subplot_titles=("Recovery Rate", "Mortatlity Rate"))
fig.add_trace(
    go.Scatter(x=datewise.index, y=(datewise["Recovered"]/datewise["Confirmed"])*100,name="Recovery Rate"),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(x=datewise.index, y=(datewise["Deaths"]/datewise["Confirmed"])*100,name="Mortality Rate"),
    row=2, col=1
)
fig.update_layout(height=1000,legend=dict(x=-0.1,y=1.2,traceorder="normal"))
fig.update_xaxes(title_text="Date", row=1, col=1)
fig.update_yaxes(title_text="Recovery Rate", row=1, col=1)
fig.update_xaxes(title_text="Date", row=1, col=2)
fig.update_yaxes(title_text="Mortality Rate", row=1, col=2)
fig.show()
Average Mortality Rate 4.489434703521251
Median Mortality Rate 4.13538370495493
Average Recovery Rate 42.03177180081855
Median Recovery Rate 45.82947437974793
In [19]:
#Mortality rate is showing a considerable (low) for a pretty long time, which is positive sign.
#Recovery Rate has started to pick up again which is a good sign, another supportive reason to why number of Closed Cases are increasing
In [20]:
#Calculating countrywise Moratality and Recovery Rate
countrywise=covid[covid["ObservationDate"]==covid["ObservationDate"].max()].groupby(["Country/Region"]).agg({"Confirmed":'sum',"Recovered":'sum',"Deaths":'sum'}).sort_values(["Confirmed"],ascending=False)
countrywise["Mortality"]=(countrywise["Deaths"]/countrywise["Confirmed"])*100
countrywise["Recovery"]=(countrywise["Recovered"]/countrywise["Confirmed"])*100
In [21]:
country_last_24_confirmed=[]
country_last_24_recovered=[]
country_last_24_deaths=[]
for country in countrywise.index:
    country_last_24_confirmed.append((grouped_country.loc[country].iloc[-1]-grouped_country.loc[country].iloc[-2])["Confirmed"])
    country_last_24_recovered.append((grouped_country.loc[country].iloc[-1]-grouped_country.loc[country].iloc[-2])["Recovered"])
    country_last_24_deaths.append((grouped_country.loc[country].iloc[-1]-grouped_country.loc[country].iloc[-2])["Deaths"])
In [22]:
Last_24_Hours_country=pd.DataFrame(list(zip(countrywise.index,country_last_24_confirmed,country_last_24_recovered,country_last_24_deaths)),
                                   columns=["Country Name","Last 24 Hours Confirmed","Last 24 Hours Recovered","Last 24 Hours Deaths"])
In [23]:
Top_20_Confirmed_24hr=Last_24_Hours_country.sort_values(["Last 24 Hours Confirmed"],ascending=False).head(20)
Top_20_Recoverd_24hr=Last_24_Hours_country.sort_values(["Last 24 Hours Recovered"],ascending=False).head(20)
Top_20_Deaths_24hr=Last_24_Hours_country.sort_values(["Last 24 Hours Deaths"],ascending=False).head(20)


fig, (ax1, ax2, ax3) = plt.subplots(3, 1,figsize=(10,20))
sns.barplot(x=Top_20_Confirmed_24hr["Last 24 Hours Confirmed"],y=Top_20_Confirmed_24hr["Country Name"],ax=ax1)
ax1.set_title("Top 20 Countries with Highest Number of Confirmed Cases in Last 24 Hours")
sns.barplot(x=Top_20_Recoverd_24hr["Last 24 Hours Recovered"],y=Top_20_Recoverd_24hr["Country Name"],ax=ax2)
ax2.set_title("Top 20 Countries with Highest Number of Recovered Cases in Last 24 Hours")
sns.barplot(x=Top_20_Deaths_24hr["Last 24 Hours Deaths"],y=Top_20_Deaths_24hr["Country Name"],ax=ax3)
ax3.set_title("Top 20 Countries with Highest Number of Death Cases in Last 24 Hours")
Out[23]:
Text(0.5, 1.0, 'Top 20 Countries with Highest Number of Death Cases in Last 24 Hours')
In [ ]:
 

Clustering of Countries based on recovery rate and mortality rate

The clustering of countries can be done considering different features. Here I'm trying to cluster different countries based on the Mortality and Recovery rate of indivisual country. As we all are well aware that COVID-19 has different Mortality Rate among different countries based on different factors and so is the Recovery Rate because of pandemic controlling practices followed by the individual country. Also Mortality Rate and Recovery Rate both togther takes into account all types of cases Confirmed, Recoverd and Deaths. Let's checkout how these clusters look like!
In [24]:
std=StandardScaler()
In [25]:
X=countrywise[["Mortality","Recovery"]]
#Standard Scaling since K-Means Clustering is a distance based alogrithm
X=std.fit_transform(X)
In [26]:
wcss=[]
sil=[]
for i in range(2,11):
    clf=KMeans(n_clusters=i,init='k-means++',random_state=42)
    clf.fit(X)
    labels=clf.labels_
    centroids=clf.cluster_centers_
    sil.append(silhouette_score(X, labels, metric='euclidean'))
    wcss.append(clf.inertia_)
In [27]:
x=np.arange(2,11)
plt.figure(figsize=(10,5))
plt.plot(x,wcss,marker='o')
plt.xlabel("Number of Clusters")
plt.ylabel("Within Cluster Sum of Squares (WCSS)")
plt.title("Elbow Method")
Out[27]:
Text(0.5, 1.0, 'Elbow Method')
In [28]:
import scipy.cluster.hierarchy as sch
plt.figure(figsize=(20,15))
dendogram=sch.dendrogram(sch.linkage(X, method  = "ward"))

All methods namely Elbow Method and Hierarchical Clustering shows K=4 will correct number of clusters.

In [29]:
clf_final=KMeans(n_clusters=4,init='k-means++',random_state=6)
clf_final.fit(X)
Out[29]:
KMeans(n_clusters=4, random_state=6)
In [30]:
countrywise["Clusters"]=clf_final.predict(X)
In [31]:
#Summary of Clusters
cluster_summary=pd.concat([countrywise[countrywise["Clusters"]==3].head(15),countrywise[countrywise["Clusters"]==1].head(15),countrywise[countrywise["Clusters"]==2].head(15),countrywise[countrywise["Clusters"]==0].head(15)])
cluster_summary.style.background_gradient(cmap='Reds').format("{:.2f}")
Out[31]:
Confirmed Recovered Deaths Mortality Recovery Clusters
Country/Region
Yemen 2029.00 1245.00 586.00 28.88 61.36 3.00
MS Zaandam 9.00 0.00 2.00 22.22 0.00 3.00
US 6933548.00 2670256.00 201884.00 2.91 38.51 1.00
Spain 693556.00 150376.00 31034.00 4.47 21.68 1.00
France 508456.00 96498.00 31447.00 6.18 18.98 1.00
UK 412245.00 2247.00 41951.00 10.18 0.55 1.00
Ukraine 189488.00 84767.00 3784.00 2.00 44.73 1.00
Belgium 106887.00 19079.00 9959.00 9.32 17.85 1.00
Netherlands 105304.00 3143.00 6344.00 6.02 2.98 1.00
Sweden 89756.00 0.00 5876.00 6.55 0.00 1.00
Honduras 72675.00 24022.00 2222.00 3.06 33.05 1.00
Ethiopia 71083.00 29253.00 1141.00 1.61 41.15 1.00
Costa Rica 68059.00 26136.00 781.00 1.15 38.40 1.00
Czech Republic 55464.00 26709.00 555.00 1.00 48.16 1.00
Paraguay 35571.00 19867.00 727.00 2.04 55.85 1.00
Serbia 33080.00 0.00 744.00 2.25 0.00 1.00
Lebanon 31792.00 13527.00 328.00 1.03 42.55 1.00
Mexico 710049.00 601611.00 74949.00 10.56 84.73 2.00
Iran 432798.00 365846.00 24840.00 5.74 84.53 2.00
Italy 302537.00 220665.00 35758.00 11.82 72.94 2.00
Canada 149939.00 129850.00 9294.00 6.20 86.60 2.00
Bolivia 131990.00 91556.00 7731.00 5.86 69.37 2.00
Ecuador 129892.00 102852.00 11171.00 8.60 79.18 2.00
Egypt 102375.00 91843.00 5822.00 5.69 89.71 2.00
Mainland China 85314.00 80509.00 4634.00 5.43 94.37 2.00
Ireland 33675.00 23364.00 1794.00 5.33 69.38 2.00
Liberia 1337.00 1219.00 82.00 6.13 91.17 2.00
Niger 1193.00 1107.00 69.00 5.78 92.79 2.00
Chad 1164.00 997.00 82.00 7.04 85.65 2.00
San Marino 723.00 669.00 42.00 5.81 92.53 2.00
Fiji 32.00 28.00 2.00 6.25 87.50 2.00
Western Sahara 10.00 8.00 1.00 10.00 80.00 2.00
India 5646010.00 4587613.00 90020.00 1.59 81.25 0.00
Brazil 4591364.00 4046827.00 138105.00 3.01 88.14 0.00
Russia 1117487.00 920602.00 19720.00 1.76 82.38 0.00
Colombia 784268.00 662277.00 24746.00 3.16 84.45 0.00
Peru 776546.00 636489.00 31568.00 4.07 81.96 0.00
South Africa 665188.00 594229.00 16206.00 2.44 89.33 0.00
Argentina 664799.00 525486.00 14376.00 2.16 79.04 0.00
Chile 449903.00 425165.00 12345.00 2.74 94.50 0.00
Bangladesh 353844.00 262953.00 5044.00 1.43 74.31 0.00
Iraq 332635.00 264988.00 8754.00 2.63 79.66 0.00
Saudi Arabia 331359.00 313786.00 4569.00 1.38 94.70 0.00
Pakistan 308217.00 294392.00 6437.00 2.09 95.51 0.00
Turkey 308069.00 270723.00 7711.00 2.50 87.88 0.00
Philippines 294591.00 231373.00 5091.00 1.73 78.54 0.00
Germany 279025.00 245706.00 9423.00 3.38 88.06 0.00
In [32]:
print("Avergae Mortality Rate of Cluster 0: ",countrywise[countrywise["Clusters"]==0]["Mortality"].mean())
print("Avergae Recovery Rate of Cluster 0: ",countrywise[countrywise["Clusters"]==0]["Recovery"].mean())
print("Avergae Mortality Rate of Cluster 1: ",countrywise[countrywise["Clusters"]==1]["Mortality"].mean())
print("Avergae Recovery Rate of Cluster 1: ",countrywise[countrywise["Clusters"]==1]["Recovery"].mean())
print("Avergae Mortality Rate of Cluster 2: ",countrywise[countrywise["Clusters"]==2]["Mortality"].mean())
print("Avergae Recovery Rate of Cluster 2: ",countrywise[countrywise["Clusters"]==2]["Recovery"].mean())
print("Avergae Mortality Rate of Cluster 3: ",countrywise[countrywise["Clusters"]==3]["Mortality"].mean())
print("Avergae Recovery Rate of Cluster 3: ",countrywise[countrywise["Clusters"]==3]["Recovery"].mean())
Avergae Mortality Rate of Cluster 0:  1.7724379352829753
Avergae Recovery Rate of Cluster 0:  85.02283261697687
Avergae Mortality Rate of Cluster 1:  2.9288428756901457
Avergae Recovery Rate of Cluster 1:  35.86523191913666
Avergae Mortality Rate of Cluster 2:  7.082463686252098
Avergae Recovery Rate of Cluster 2:  84.03051992695463
Avergae Mortality Rate of Cluster 3:  25.55172224960298
Avergae Recovery Rate of Cluster 3:  30.680137999014296
In [33]:
plt.figure(figsize=(10,5))
sns.scatterplot(x=countrywise["Recovery"],y=countrywise["Mortality"],hue=countrywise["Clusters"],s=100)
plt.axvline(((datewise["Recovered"]/datewise["Confirmed"])*100).mean(),
            color='red',linestyle="--",label="Mean Recovery Rate around the World")
plt.axhline(((datewise["Deaths"]/datewise["Confirmed"])*100).mean(),
            color='black',linestyle="--",label="Mean Mortality Rate around the World")
plt.legend()
Out[33]:
<matplotlib.legend.Legend at 0x7fee52fd1d90>
In [34]:
print("Few Countries belonging to Cluster 0: ",list(countrywise[countrywise["Clusters"]==0].head(10).index))
print("Few Countries belonging to Cluster 1: ",list(countrywise[countrywise["Clusters"]==1].head(10).index))
print("Few Countries belonging to Cluster 2: ",list(countrywise[countrywise["Clusters"]==2].head(10).index))
print("Few Countries belonging to Cluster 3: ",list(countrywise[countrywise["Clusters"]==3].head(10).index))
Few Countries belonging to Cluster 0:  ['India', 'Brazil', 'Russia', 'Colombia', 'Peru', 'South Africa', 'Argentina', 'Chile', 'Bangladesh', 'Iraq']
Few Countries belonging to Cluster 1:  ['US', 'Spain', 'France', 'UK', 'Ukraine', 'Belgium', 'Netherlands', 'Sweden', 'Honduras', 'Ethiopia']
Few Countries belonging to Cluster 2:  ['Mexico', 'Iran', 'Italy', 'Canada', 'Bolivia', 'Ecuador', 'Egypt', 'Mainland China', 'Ireland', 'Liberia']
Few Countries belonging to Cluster 3:  ['Yemen', 'MS Zaandam']

Cluster 0 is set of countries which have Low Mortality Rate and really High Recovery Rate.

Cluster 1 is set of countries which have Low Mortality Rate and really Low Recovery Rate. These countries need to pace up their Revovery Rate to get out it, Some thses countries have really high number of Infected Cases but Low Mortality is positive sign out of it.

Cluster 2 is a set of countries which have really High Mortality Rate and consdierably Good Recovery Rate.

Cluster 3 is a set of countries which have high mortality rate and low recovery rate.

In [ ]:
 

Data analysis and Forecasting specific to India

In [35]:
india_data=covid[covid["Country/Region"]=="India"]
datewise_india=india_data.groupby(["ObservationDate"]).agg({"Confirmed":'sum',"Recovered":'sum',"Deaths":'sum'})
print(datewise_india.iloc[-1])
print("Total Active Cases: ",datewise_india["Confirmed"].iloc[-1]-datewise_india["Recovered"].iloc[-1]-datewise_india["Deaths"].iloc[-1])
print("Total Closed Cases: ",datewise_india["Recovered"].iloc[-1]+datewise_india["Deaths"].iloc[-1])
Confirmed    5646010.0
Recovered    4587613.0
Deaths         90020.0
Name: 2020-09-23 00:00:00, dtype: float64
Total Active Cases:  968377.0
Total Closed Cases:  4677633.0
In [36]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=datewise_india.index, y=datewise_india["Confirmed"],
                    mode='lines+markers',
                    name='Confirmed Cases'))
fig.add_trace(go.Scatter(x=datewise_india.index, y=datewise_india["Recovered"],
                    mode='lines+markers',
                    name='Recovered Cases'))
fig.add_trace(go.Scatter(x=datewise_india.index, y=datewise_india["Deaths"],
                    mode='lines+markers',
                    name='Death Cases'))
fig.update_layout(title="Growth of different types of cases in India",
                 xaxis_title="Date",yaxis_title="Number of Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()
In [37]:
fig=px.bar(x=datewise_india.index,y=datewise_india["Confirmed"]-datewise_india["Recovered"]-datewise_india["Deaths"])
fig.update_layout(title="Distribution of Number of Active Cases in India",
                  xaxis_title="Date",yaxis_title="Number of Cases",)
fig.show()
In [38]:
datewise_india["WeekOfYear"]=datewise_india.index.weekofyear

week_num_india=[]
india_weekwise_confirmed=[]
india_weekwise_recovered=[]
india_weekwise_deaths=[]
w=1
for i in list(datewise_india["WeekOfYear"].unique()):
    india_weekwise_confirmed.append(datewise_india[datewise_india["WeekOfYear"]==i]["Confirmed"].iloc[-1])
    india_weekwise_recovered.append(datewise_india[datewise_india["WeekOfYear"]==i]["Recovered"].iloc[-1])
    india_weekwise_deaths.append(datewise_india[datewise_india["WeekOfYear"]==i]["Deaths"].iloc[-1])
    week_num_india.append(w)
    w=w+1
    
fig=go.Figure()
fig.add_trace(go.Scatter(x=week_num_india, y=india_weekwise_confirmed,
                    mode='lines+markers',
                    name='Weekly Growth of Confirmed Cases'))
fig.add_trace(go.Scatter(x=week_num_india, y=india_weekwise_recovered,
                    mode='lines+markers',
                    name='Weekly Growth of Recovered Cases'))
fig.add_trace(go.Scatter(x=week_num_india, y=india_weekwise_deaths,
                    mode='lines+markers',
                    name='Weekly Growth of Death Cases'))
fig.update_layout(title="Weekly Growth of different types of Cases in India",
                 xaxis_title="Week Number",yaxis_title="Number of Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()
In [ ]:
 

Prediction using Machine Learning Models

1--Linear Regression Model for Confirm Cases Prediction

In [39]:
datewise["Days Since"]=datewise.index-datewise.index[0]
datewise["Days Since"]=datewise["Days Since"].dt.days
In [40]:
train_ml=datewise.iloc[:int(datewise.shape[0]*0.95)]
valid_ml=datewise.iloc[int(datewise.shape[0]*0.95):]
model_scores=[]
In [41]:
lin_reg=LinearRegression(normalize=True)
In [42]:
lin_reg.fit(np.array(train_ml["Days Since"]).reshape(-1,1),np.array(train_ml["Confirmed"]).reshape(-1,1))
Out[42]:
LinearRegression(normalize=True)
In [43]:
prediction_valid_linreg=lin_reg.predict(np.array(valid_ml["Days Since"]).reshape(-1,1))
In [44]:
model_scores.append(np.sqrt(mean_squared_error(valid_ml["Confirmed"],prediction_valid_linreg)))
print("Root Mean Square Error for Linear Regression: ",np.sqrt(mean_squared_error(valid_ml["Confirmed"],prediction_valid_linreg)))
Root Mean Square Error for Linear Regression:  7830571.030601998
In [45]:
plt.figure(figsize=(11,6))
prediction_linreg=lin_reg.predict(np.array(datewise["Days Since"]).reshape(-1,1))
linreg_output=[]
for i in range(prediction_linreg.shape[0]):
    linreg_output.append(prediction_linreg[i][0])

fig=go.Figure()
fig.add_trace(go.Scatter(x=datewise.index, y=datewise["Confirmed"],
                    mode='lines+markers',name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=datewise.index, y=linreg_output,
                    mode='lines',name="Linear Regression Best Fit Line",
                    line=dict(color='black', dash='dot')))
fig.update_layout(title="Confirmed Cases Linear Regression Prediction",
                 xaxis_title="Date",yaxis_title="Confirmed Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()
<Figure size 792x432 with 0 Axes>

The Linear Regression Model is absolutely falling aprat. As it is clearly visible that the trend of Confirmed Cases in absolutely not Linear.Predictions of Linear Regression are nowhere close to actual values.

In [ ]:
 

2--Polynomial Regression for Prediction of Confirmed Cases

In [46]:
train_ml=datewise.iloc[:int(datewise.shape[0]*0.95)]
valid_ml=datewise.iloc[int(datewise.shape[0]*0.95):]
In [47]:
poly = PolynomialFeatures(degree = 8) 
In [48]:
train_poly=poly.fit_transform(np.array(train_ml["Days Since"]).reshape(-1,1))
valid_poly=poly.fit_transform(np.array(valid_ml["Days Since"]).reshape(-1,1))
y=train_ml["Confirmed"]
In [49]:
linreg=LinearRegression(normalize=True)
linreg.fit(train_poly,y)
Out[49]:
LinearRegression(normalize=True)
In [50]:
prediction_poly=linreg.predict(valid_poly)
rmse_poly=np.sqrt(mean_squared_error(valid_ml["Confirmed"],prediction_poly))
model_scores.append(rmse_poly)
print("Root Mean Squared Error for Polynomial Regression: ",rmse_poly)
Root Mean Squared Error for Polynomial Regression:  985900.2801292373
In [51]:
comp_data=poly.fit_transform(np.array(datewise["Days Since"]).reshape(-1,1))
plt.figure(figsize=(11,6))
predictions_poly=linreg.predict(comp_data)

fig=go.Figure()
fig.add_trace(go.Scatter(x=datewise.index, y=datewise["Confirmed"],
                    mode='lines+markers',name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=datewise.index, y=predictions_poly,
                    mode='lines',name="Polynomial Regression Best Fit",
                    line=dict(color='black', dash='dot')))
fig.update_layout(title="Confirmed Cases Polynomial Regression Prediction",
                 xaxis_title="Date",yaxis_title="Confirmed Cases",
                 legend=dict(x=0,y=1,traceorder="normal"))
fig.show()
<Figure size 792x432 with 0 Axes>
In [52]:
new_prediction_poly=[]
for i in range(1,18):
    new_date_poly=poly.fit_transform(np.array(datewise["Days Since"].max()+i).reshape(-1,1))
    new_prediction_poly.append(linreg.predict(new_date_poly)[0])
In [ ]:
 

Support Vector Machine Model Regressor for Prediction of Confirmed Cases

In [53]:
train_ml=datewise.iloc[:int(datewise.shape[0]*0.95)]
valid_ml=datewise.iloc[int(datewise.shape[0]*0.95):]
In [54]:
#Intializing SVR Model
svm=SVR(C=1,degree=6,kernel='poly',epsilon=0.01)
In [55]:
#Fitting model on the training data
svm.fit(np.array(train_ml["Days Since"]).reshape(-1,1),np.array(train_ml["Confirmed"]).reshape(-1,1))
Out[55]:
SVR(C=1, degree=6, epsilon=0.01, kernel='poly')
In [56]:
prediction_valid_svm=svm.predict(np.array(valid_ml["Days Since"]).reshape(-1,1))
In [57]:
model_scores.append(np.sqrt(mean_squared_error(valid_ml["Confirmed"],prediction_valid_svm)))
print("Root Mean Square Error for Support Vectore Machine: ",np.sqrt(mean_squared_error(valid_ml["Confirmed"],prediction_valid_svm)))
Root Mean Square Error for Support Vectore Machine:  9793495.12240128
In [58]:
plt.figure(figsize=(11,6))
prediction_svm=svm.predict(np.array(datewise["Days Since"]).reshape(-1,1))
fig=go.Figure()
fig.add_trace(go.Scatter(x=datewise.index, y=datewise["Confirmed"],
                    mode='lines+markers',name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=datewise.index, y=prediction_svm,
                    mode='lines',name="Support Vector Machine Best fit Kernel",
                    line=dict(color='black', dash='dot')))
fig.update_layout(title="Confirmed Cases Support Vectore Machine Regressor Prediction",
                 xaxis_title="Date",yaxis_title="Confirmed Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()
<Figure size 792x432 with 0 Axes>

Support Vector Machine model isn't providing great results now, the predictions are either overshooting or really lower than what's expected.

In [59]:
new_date=[]
new_prediction_lr=[]
new_prediction_svm=[]
for i in range(1,18):
    new_date.append(datewise.index[-1]+timedelta(days=i))
    new_prediction_lr.append(lin_reg.predict(np.array(datewise["Days Since"].max()+i).reshape(-1,1))[0][0])
    new_prediction_svm.append(svm.predict(np.array(datewise["Days Since"].max()+i).reshape(-1,1))[0])
In [60]:
pd.set_option('display.float_format', lambda x: '%.6f' % x)
model_predictions=pd.DataFrame(zip(new_date,new_prediction_lr,new_prediction_poly,new_prediction_svm),
                               columns=["Dates","Linear Regression Prediction","Polynonmial Regression Prediction","SVM Prediction"])
model_predictions.head()
Out[60]:
Dates Linear Regression Prediction Polynonmial Regression Prediction SVM Prediction
0 2020-09-24 23113975.222954 34318786.968828 46306357.160791
1 2020-09-25 23231108.623613 34952218.800302 47364738.610866
2 2020-09-26 23348242.024271 35625017.197156 48444763.017170
3 2020-09-27 23465375.424930 36340167.331798 49546783.004272
4 2020-09-28 23582508.825588 37100814.713970 50671155.488235
In [ ]:
 

Time Series Forecasting

Time series data is different because it is recorded at regular time intervals. Any predictive model based on time series data will have time as an independent variable. The output of a model would be the predicted value or classification at a specific time. The time series models used here are AR model,MA model,ARIMA model,SARIMA model and FBProphet.

1--AR Model (using AUTO ARIMA)

In [61]:
pip install pmdarima
Requirement already satisfied: pmdarima in /Applications/anaconda3/lib/python3.8/site-packages (1.7.1)
Requirement already satisfied: statsmodels<0.12,>=0.11 in /Applications/anaconda3/lib/python3.8/site-packages (from pmdarima) (0.11.1)
Requirement already satisfied: setuptools<50.0.0 in /Applications/anaconda3/lib/python3.8/site-packages (from pmdarima) (49.2.0.post20200714)
Requirement already satisfied: numpy>=1.17.3 in /Applications/anaconda3/lib/python3.8/site-packages (from pmdarima) (1.18.5)
Requirement already satisfied: pandas>=0.19 in /Applications/anaconda3/lib/python3.8/site-packages (from pmdarima) (1.0.5)
Requirement already satisfied: scipy>=1.3.2 in /Applications/anaconda3/lib/python3.8/site-packages (from pmdarima) (1.5.0)
Requirement already satisfied: Cython<0.29.18,>=0.29 in /Applications/anaconda3/lib/python3.8/site-packages (from pmdarima) (0.29.17)
Requirement already satisfied: urllib3 in /Applications/anaconda3/lib/python3.8/site-packages (from pmdarima) (1.25.9)
Requirement already satisfied: joblib>=0.11 in /Applications/anaconda3/lib/python3.8/site-packages (from pmdarima) (0.16.0)
Requirement already satisfied: scikit-learn>=0.22 in /Applications/anaconda3/lib/python3.8/site-packages (from pmdarima) (0.23.1)
Requirement already satisfied: patsy>=0.5 in /Applications/anaconda3/lib/python3.8/site-packages (from statsmodels<0.12,>=0.11->pmdarima) (0.5.1)
Requirement already satisfied: python-dateutil>=2.6.1 in /Applications/anaconda3/lib/python3.8/site-packages (from pandas>=0.19->pmdarima) (2.8.1)
Requirement already satisfied: pytz>=2017.2 in /Applications/anaconda3/lib/python3.8/site-packages (from pandas>=0.19->pmdarima) (2020.1)
Requirement already satisfied: threadpoolctl>=2.0.0 in /Applications/anaconda3/lib/python3.8/site-packages (from scikit-learn>=0.22->pmdarima) (2.1.0)
Requirement already satisfied: six in /Applications/anaconda3/lib/python3.8/site-packages (from patsy>=0.5->statsmodels<0.12,>=0.11->pmdarima) (1.15.0)
Note: you may need to restart the kernel to use updated packages.
In [62]:
from pmdarima import auto_arima
In [63]:
model_train=datewise.iloc[:int(datewise.shape[0]*0.95)]
valid=datewise.iloc[int(datewise.shape[0]*0.95):]
y_pred=valid.copy()
In [64]:
model_ar= auto_arima(model_train["Confirmed"],trace=True, error_action='ignore', start_p=0,start_q=0,max_p=4,max_q=0,
                   suppress_warnings=True,stepwise=False,seasonal=False)
model_ar.fit(model_train["Confirmed"])
 ARIMA(0,2,0)(0,0,0)[0] intercept   : AIC=5238.720, Time=0.06 sec
 ARIMA(1,2,0)(0,0,0)[0] intercept   : AIC=5226.902, Time=0.03 sec
 ARIMA(2,2,0)(0,0,0)[0] intercept   : AIC=5227.881, Time=0.03 sec
 ARIMA(3,2,0)(0,0,0)[0] intercept   : AIC=5188.886, Time=0.04 sec
 ARIMA(4,2,0)(0,0,0)[0] intercept   : AIC=5143.415, Time=0.07 sec
Total fit time: 0.241 seconds
Out[64]:
ARIMA(order=(4, 2, 0), scoring_args={}, suppress_warnings=True)
In [65]:
prediction_ar=model_ar.predict(len(valid))
y_pred["AR Model Prediction"]=prediction_ar
In [66]:
model_scores.append(np.sqrt(mean_squared_error(y_pred["Confirmed"],y_pred["AR Model Prediction"])))
print("Root Mean Square Error for AR Model: ",np.sqrt(mean_squared_error(y_pred["Confirmed"],y_pred["AR Model Prediction"])))
Root Mean Square Error for AR Model:  37251.696672764076
In [67]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=model_train.index, y=model_train["Confirmed"],
                    mode='lines+markers',name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=valid.index, y=valid["Confirmed"],
                    mode='lines+markers',name="Validation Data for Confirmed Cases",))
fig.add_trace(go.Scatter(x=valid.index, y=y_pred["AR Model Prediction"],
                    mode='lines+markers',name="Prediction of Confirmed Cases",))
fig.update_layout(title="Confirmed Cases AR Model Prediction",
                 xaxis_title="Date",yaxis_title="Confirmed Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()
In [69]:
AR_model_new_prediction=[]
for i in range(1,18):
    AR_model_new_prediction.append(model_ar.predict(len(valid)+i)[-1])
model_predictions["AR Model Prediction"]=AR_model_new_prediction
model_predictions.head()
Out[69]:
Dates Linear Regression Prediction Polynonmial Regression Prediction SVM Prediction AR Model Prediction
0 2020-09-24 23113975.222954 34318786.968828 46306357.160791 32075547.701533
1 2020-09-25 23231108.623613 34952218.800302 47364738.610866 32361180.435583
2 2020-09-26 23348242.024271 35625017.197156 48444763.017170 32646419.980162
3 2020-09-27 23465375.424930 36340167.331798 49546783.004272 32935229.764002
4 2020-09-28 23582508.825588 37100814.713970 50671155.488235 33228808.048598

2--MA Model (using AUTO ARIMA)

In [70]:
model_train=datewise.iloc[:int(datewise.shape[0]*0.95)]
valid=datewise.iloc[int(datewise.shape[0]*0.95):]
y_pred=valid.copy()
In [71]:
model_ma= auto_arima(model_train["Confirmed"],trace=True, error_action='ignore', start_p=0,start_q=0,max_p=0,max_q=2,
                   suppress_warnings=True,stepwise=False,seasonal=False)
model_ma.fit(model_train["Confirmed"])
 ARIMA(0,2,0)(0,0,0)[0] intercept   : AIC=5238.720, Time=0.01 sec
 ARIMA(0,2,1)(0,0,0)[0] intercept   : AIC=5218.406, Time=0.03 sec
 ARIMA(0,2,2)(0,0,0)[0] intercept   : AIC=5226.234, Time=0.05 sec
Total fit time: 0.102 seconds
Out[71]:
ARIMA(order=(0, 2, 1), scoring_args={}, suppress_warnings=True)
In [72]:
prediction_ma=model_ma.predict(len(valid))
y_pred["MA Model Prediction"]=prediction_ma
In [73]:
model_scores.append(np.sqrt(mean_squared_error(valid["Confirmed"],prediction_ma)))
print("Root Mean Square Error for MA Model: ",np.sqrt(mean_squared_error(valid["Confirmed"],prediction_ma)))
Root Mean Square Error for MA Model:  62945.04576803569
In [74]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=model_train.index, y=model_train["Confirmed"],
                    mode='lines+markers',name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=valid.index, y=valid["Confirmed"],
                    mode='lines+markers',name="Validation Data for Confirmed Cases",))
fig.add_trace(go.Scatter(x=valid.index, y=y_pred["MA Model Prediction"],
                    mode='lines+markers',name="Prediction for Confirmed Cases",))
fig.update_layout(title="Confirmed Cases MA Model Prediction",
                 xaxis_title="Date",yaxis_title="Confirmed Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()
In [75]:
MA_model_new_prediction=[]
for i in range(1,18):
    MA_model_new_prediction.append(model_ma.predict(len(valid)+i)[-1])
model_predictions["MA Model Prediction"]=MA_model_new_prediction
model_predictions.head()
Out[75]:
Dates Linear Regression Prediction Polynonmial Regression Prediction SVM Prediction AR Model Prediction MA Model Prediction
0 2020-09-24 23113975.222954 34318786.968828 46306357.160791 32075547.701533 32179131.037750
1 2020-09-25 23231108.623613 34952218.800302 47364738.610866 32361180.435583 32477987.568195
2 2020-09-26 23348242.024271 35625017.197156 48444763.017170 32646419.980162 32778432.149959
3 2020-09-27 23465375.424930 36340167.331798 49546783.004272 32935229.764002 33080464.783041
4 2020-09-28 23582508.825588 37100814.713970 50671155.488235 33228808.048598 33384085.467443

3--ARIMA Model (using AUTOARIMA)

In [76]:
model_train=datewise.iloc[:int(datewise.shape[0]*0.95)]
valid=datewise.iloc[int(datewise.shape[0]*0.95):]
y_pred=valid.copy()
In [77]:
model_arima= auto_arima(model_train["Confirmed"],trace=True, error_action='ignore', start_p=1,start_q=1,max_p=3,max_q=3,
                   suppress_warnings=True,stepwise=False,seasonal=False)
model_arima.fit(model_train["Confirmed"])
 ARIMA(0,2,0)(0,0,0)[0] intercept   : AIC=5238.720, Time=0.02 sec
 ARIMA(0,2,1)(0,0,0)[0] intercept   : AIC=5218.406, Time=0.03 sec
 ARIMA(0,2,2)(0,0,0)[0] intercept   : AIC=5226.234, Time=0.05 sec
 ARIMA(0,2,3)(0,0,0)[0] intercept   : AIC=5226.683, Time=0.09 sec
 ARIMA(1,2,0)(0,0,0)[0] intercept   : AIC=5226.902, Time=0.02 sec
 ARIMA(1,2,1)(0,0,0)[0] intercept   : AIC=5220.548, Time=0.08 sec
 ARIMA(1,2,2)(0,0,0)[0] intercept   : AIC=5375.066, Time=0.14 sec
 ARIMA(1,2,3)(0,0,0)[0] intercept   : AIC=5212.702, Time=0.10 sec
 ARIMA(2,2,0)(0,0,0)[0] intercept   : AIC=5227.881, Time=0.02 sec
 ARIMA(2,2,1)(0,0,0)[0] intercept   : AIC=5222.985, Time=0.16 sec
 ARIMA(2,2,2)(0,0,0)[0] intercept   : AIC=5224.692, Time=0.41 sec
 ARIMA(2,2,3)(0,0,0)[0] intercept   : AIC=5208.092, Time=0.20 sec
 ARIMA(3,2,0)(0,0,0)[0] intercept   : AIC=5188.886, Time=0.06 sec
 ARIMA(3,2,1)(0,0,0)[0] intercept   : AIC=5133.067, Time=0.13 sec
 ARIMA(3,2,2)(0,0,0)[0] intercept   : AIC=5086.450, Time=0.44 sec
Total fit time: 1.955 seconds
Out[77]:
ARIMA(order=(3, 2, 2), scoring_args={}, suppress_warnings=True)
In [78]:
prediction_arima=model_arima.predict(len(valid))
y_pred["ARIMA Model Prediction"]=prediction_arima
In [79]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=model_train.index, y=model_train["Confirmed"],
                    mode='lines+markers',name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=valid.index, y=valid["Confirmed"],
                    mode='lines+markers',name="Validation Data for Confirmed Cases",))
fig.add_trace(go.Scatter(x=valid.index, y=y_pred["ARIMA Model Prediction"],
                    mode='lines+markers',name="Prediction for Confirmed Cases",))
fig.update_layout(title="Confirmed Cases ARIMA Model Prediction",
                 xaxis_title="Date",yaxis_title="Confirmed Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()
In [80]:
ARIMA_model_new_prediction=[]
for i in range(1,18):
    ARIMA_model_new_prediction.append(model_arima.predict(len(valid)+i)[-1])
model_predictions["ARIMA Model Prediction"]=ARIMA_model_new_prediction
model_predictions.head()
Out[80]:
Dates Linear Regression Prediction Polynonmial Regression Prediction SVM Prediction AR Model Prediction MA Model Prediction ARIMA Model Prediction
0 2020-09-24 23113975.222954 34318786.968828 46306357.160791 32075547.701533 32179131.037750 32377591.069139
1 2020-09-25 23231108.623613 34952218.800302 47364738.610866 32361180.435583 32477987.568195 32730494.816233
2 2020-09-26 23348242.024271 35625017.197156 48444763.017170 32646419.980162 32778432.149959 33061595.303452
3 2020-09-27 23465375.424930 36340167.331798 49546783.004272 32935229.764002 33080464.783041 33373293.539074
4 2020-09-28 23582508.825588 37100814.713970 50671155.488235 33228808.048598 33384085.467443 33685407.521220

4--SARIMA Model (using AUTO ARIMA)

In [81]:
model_sarima= auto_arima(model_train["Confirmed"],trace=True, error_action='ignore', 
                         start_p=0,start_q=0,max_p=2,max_q=2,m=7,
                   suppress_warnings=True,stepwise=True,seasonal=True)
model_sarima.fit(model_train["Confirmed"])
Performing stepwise search to minimize aic
 ARIMA(0,2,0)(1,0,1)[7]             : AIC=5171.866, Time=0.25 sec
 ARIMA(0,2,0)(0,0,0)[7]             : AIC=5237.673, Time=0.01 sec
 ARIMA(1,2,0)(1,0,0)[7]             : AIC=5137.797, Time=0.06 sec
 ARIMA(0,2,1)(0,0,1)[7]             : AIC=5179.694, Time=0.06 sec
 ARIMA(1,2,0)(0,0,0)[7]             : AIC=5226.426, Time=0.01 sec
 ARIMA(1,2,0)(2,0,0)[7]             : AIC=5123.124, Time=0.11 sec
 ARIMA(1,2,0)(2,0,1)[7]             : AIC=5117.732, Time=0.22 sec
 ARIMA(1,2,0)(1,0,1)[7]             : AIC=5063.135, Time=0.33 sec
 ARIMA(1,2,0)(0,0,1)[7]             : AIC=5189.197, Time=0.05 sec
 ARIMA(1,2,0)(1,0,2)[7]             : AIC=5117.727, Time=0.20 sec
 ARIMA(1,2,0)(0,0,2)[7]             : AIC=5160.948, Time=0.10 sec
 ARIMA(1,2,0)(2,0,2)[7]             : AIC=5119.085, Time=0.26 sec
 ARIMA(2,2,0)(1,0,1)[7]             : AIC=5060.706, Time=0.48 sec
 ARIMA(2,2,0)(0,0,1)[7]             : AIC=5189.370, Time=0.06 sec
 ARIMA(2,2,0)(1,0,0)[7]             : AIC=5136.917, Time=0.06 sec
 ARIMA(2,2,0)(2,0,1)[7]             : AIC=5117.700, Time=0.25 sec
 ARIMA(2,2,0)(1,0,2)[7]             : AIC=5117.699, Time=0.23 sec
 ARIMA(2,2,0)(0,0,0)[7]             : AIC=5227.237, Time=0.01 sec
 ARIMA(2,2,0)(0,0,2)[7]             : AIC=5162.863, Time=0.11 sec
 ARIMA(2,2,0)(2,0,0)[7]             : AIC=5121.167, Time=0.15 sec
 ARIMA(2,2,0)(2,0,2)[7]             : AIC=5118.678, Time=0.37 sec
 ARIMA(2,2,1)(1,0,1)[7]             : AIC=5089.049, Time=0.19 sec
 ARIMA(1,2,1)(1,0,1)[7]             : AIC=5059.530, Time=0.43 sec
 ARIMA(1,2,1)(0,0,1)[7]             : AIC=5178.971, Time=0.06 sec
 ARIMA(1,2,1)(1,0,0)[7]             : AIC=5124.906, Time=0.13 sec
 ARIMA(1,2,1)(2,0,1)[7]             : AIC=5108.695, Time=0.45 sec
 ARIMA(1,2,1)(1,0,2)[7]             : AIC=5108.737, Time=0.43 sec
 ARIMA(1,2,1)(0,0,0)[7]             : AIC=5219.617, Time=0.05 sec
 ARIMA(1,2,1)(0,0,2)[7]             : AIC=5144.524, Time=0.38 sec
 ARIMA(1,2,1)(2,0,0)[7]             : AIC=5109.597, Time=0.30 sec
 ARIMA(1,2,1)(2,0,2)[7]             : AIC=5109.182, Time=0.57 sec
 ARIMA(0,2,1)(1,0,1)[7]             : AIC=5116.875, Time=0.11 sec
 ARIMA(1,2,2)(1,0,1)[7]             : AIC=5052.230, Time=0.44 sec
 ARIMA(1,2,2)(0,0,1)[7]             : AIC=5171.898, Time=0.10 sec
 ARIMA(1,2,2)(1,0,0)[7]             : AIC=5094.482, Time=0.22 sec
 ARIMA(1,2,2)(2,0,1)[7]             : AIC=5091.770, Time=0.34 sec
 ARIMA(1,2,2)(1,0,2)[7]             : AIC=5092.019, Time=0.44 sec
 ARIMA(1,2,2)(0,0,0)[7]             : AIC=5202.676, Time=0.06 sec
 ARIMA(1,2,2)(0,0,2)[7]             : AIC=5134.273, Time=0.24 sec
 ARIMA(1,2,2)(2,0,0)[7]             : AIC=5094.481, Time=0.23 sec
 ARIMA(1,2,2)(2,0,2)[7]             : AIC=5091.484, Time=0.46 sec
 ARIMA(0,2,2)(1,0,1)[7]             : AIC=5094.059, Time=0.14 sec
 ARIMA(2,2,2)(1,0,1)[7]             : AIC=inf, Time=0.36 sec
 ARIMA(1,2,2)(1,0,1)[7] intercept   : AIC=inf, Time=0.38 sec

Best model:  ARIMA(1,2,2)(1,0,1)[7]          
Total fit time: 9.915 seconds
Out[81]:
ARIMA(order=(1, 2, 2), scoring_args={}, seasonal_order=(1, 0, 1, 7),
      suppress_warnings=True, with_intercept=False)
In [82]:
prediction_sarima=model_sarima.predict(len(valid))
y_pred["SARIMA Model Prediction"]=prediction_sarima
In [83]:
model_scores.append(np.sqrt(mean_squared_error(y_pred["Confirmed"],y_pred["SARIMA Model Prediction"])))
print("Root Mean Square Error for SARIMA Model: ",np.sqrt(mean_squared_error(y_pred["Confirmed"],y_pred["SARIMA Model Prediction"])))
Root Mean Square Error for SARIMA Model:  105870.68779135532
In [84]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=model_train.index, y=model_train["Confirmed"],
                    mode='lines+markers',name="Train Data for Confirmed Cases"))
fig.add_trace(go.Scatter(x=valid.index, y=valid["Confirmed"],
                    mode='lines+markers',name="Validation Data for Confirmed Cases",))
fig.add_trace(go.Scatter(x=valid.index, y=y_pred["SARIMA Model Prediction"],
                    mode='lines+markers',name="Prediction for Confirmed Cases",))
fig.update_layout(title="Confirmed Cases SARIMA Model Prediction",
                 xaxis_title="Date",yaxis_title="Confirmed Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()
In [85]:
SARIMA_model_new_prediction=[]
for i in range(1,18):
    SARIMA_model_new_prediction.append(model_sarima.predict(len(valid)+i)[-1])
model_predictions["SARIMA Model Prediction"]=SARIMA_model_new_prediction
model_predictions.head()
Out[85]:
Dates Linear Regression Prediction Polynonmial Regression Prediction SVM Prediction AR Model Prediction MA Model Prediction ARIMA Model Prediction SARIMA Model Prediction
0 2020-09-24 23113975.222954 34318786.968828 46306357.160791 32075547.701533 32179131.037750 32377591.069139 31905350.926105
1 2020-09-25 23231108.623613 34952218.800302 47364738.610866 32361180.435583 32477987.568195 32730494.816233 32201468.215238
2 2020-09-26 23348242.024271 35625017.197156 48444763.017170 32646419.980162 32778432.149959 33061595.303452 32470985.304127
3 2020-09-27 23465375.424930 36340167.331798 49546783.004272 32935229.764002 33080464.783041 33373293.539074 32710861.945476
4 2020-09-28 23582508.825588 37100814.713970 50671155.488235 33228808.048598 33384085.467443 33685407.521220 32957592.207288

Summarization of Forecasts using different Models¶

In [86]:
model_names=["Linear Regression","Polynomial Regression","Support Vector Machine Regressor",
            "Auto Regressive Model (AR)","Moving Average Model (MA)","ARIMA Model","SARIMA Model"]
model_summary=pd.DataFrame(zip(model_names,model_scores),columns=["Model Name","Root Mean Squared Error"]).sort_values(["Root Mean Squared Error"])
model_summary
Out[86]:
Model Name Root Mean Squared Error
3 Auto Regressive Model (AR) 37251.696673
4 Moving Average Model (MA) 62945.045768
5 ARIMA Model 105870.687791
1 Polynomial Regression 985900.280129
0 Linear Regression 7830571.030602
2 Support Vector Machine Regressor 9793495.122401
In [ ]:
 

Time Series Forecasting for Death Cases

In [88]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=model_train.index, y=model_train["Deaths"],
                    mode='lines+markers',name="Death Cases"))
fig.update_layout(title="Death Cases",
                 xaxis_title="Date",yaxis_title="Number of Death Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()
In [89]:
model_train=datewise.iloc[:int(datewise.shape[0]*0.95)]
valid=datewise.iloc[int(datewise.shape[0]*0.95):]
y_pred=valid.copy()
In [90]:
model_arima_deaths=auto_arima(model_train["Deaths"],trace=True, error_action='ignore', start_p=0,start_q=0,
                              max_p=5,max_q=5,suppress_warnings=True,stepwise=False,seasonal=False)     
model_arima_deaths.fit(model_train["Deaths"])
 ARIMA(0,2,0)(0,0,0)[0] intercept   : AIC=4002.845, Time=0.01 sec
 ARIMA(0,2,1)(0,0,0)[0] intercept   : AIC=3964.048, Time=0.13 sec
 ARIMA(0,2,2)(0,0,0)[0] intercept   : AIC=3948.317, Time=0.12 sec
 ARIMA(0,2,3)(0,0,0)[0] intercept   : AIC=3941.841, Time=0.16 sec
 ARIMA(0,2,4)(0,0,0)[0] intercept   : AIC=3911.075, Time=0.34 sec
 ARIMA(0,2,5)(0,0,0)[0] intercept   : AIC=3909.056, Time=0.38 sec
 ARIMA(1,2,0)(0,0,0)[0] intercept   : AIC=3989.453, Time=0.03 sec
 ARIMA(1,2,1)(0,0,0)[0] intercept   : AIC=3947.299, Time=0.21 sec
 ARIMA(1,2,2)(0,0,0)[0] intercept   : AIC=3948.415, Time=0.13 sec
 ARIMA(1,2,3)(0,0,0)[0] intercept   : AIC=3936.098, Time=0.30 sec
 ARIMA(1,2,4)(0,0,0)[0] intercept   : AIC=3905.012, Time=0.39 sec
 ARIMA(2,2,0)(0,0,0)[0] intercept   : AIC=3990.160, Time=0.03 sec
 ARIMA(2,2,1)(0,0,0)[0] intercept   : AIC=3946.507, Time=0.17 sec
 ARIMA(2,2,2)(0,0,0)[0] intercept   : AIC=3946.026, Time=0.31 sec
 ARIMA(2,2,3)(0,0,0)[0] intercept   : AIC=3878.215, Time=0.35 sec
 ARIMA(3,2,0)(0,0,0)[0] intercept   : AIC=3956.526, Time=0.04 sec
 ARIMA(3,2,1)(0,0,0)[0] intercept   : AIC=3912.152, Time=0.28 sec
 ARIMA(3,2,2)(0,0,0)[0] intercept   : AIC=3882.068, Time=0.30 sec
 ARIMA(4,2,0)(0,0,0)[0] intercept   : AIC=3941.440, Time=0.06 sec
 ARIMA(4,2,1)(0,0,0)[0] intercept   : AIC=3907.756, Time=0.19 sec
 ARIMA(5,2,0)(0,0,0)[0] intercept   : AIC=3893.866, Time=0.07 sec
Total fit time: 4.018 seconds
Out[90]:
ARIMA(order=(2, 2, 3), scoring_args={}, suppress_warnings=True)
In [91]:
predictions_deaths=model_arima_deaths.predict(len(valid))
y_pred["ARIMA Death Prediction"]=predictions_deaths
In [92]:
fig=go.Figure()
fig.add_trace(go.Scatter(x=model_train.index, y=model_train["Deaths"],
                    mode='lines+markers',name="Train Data for Death Cases"))
fig.add_trace(go.Scatter(x=valid.index, y=valid["Deaths"],
                    mode='lines+markers',name="Validation Data for Death Cases",))
fig.add_trace(go.Scatter(x=valid.index, y=y_pred["ARIMA Death Prediction"],
                    mode='lines+markers',name="Prediction for Death Cases",))
fig.update_layout(title="Death Cases ARIMA Model Prediction",
                 xaxis_title="Date",yaxis_title="Death Cases",legend=dict(x=0,y=1,traceorder="normal"))
fig.show()
In [93]:
ARIMA_model_death_forecast=[]
for i in range(1,18):
    ARIMA_model_death_forecast.append(model_arima_deaths.predict(len(valid)+i)[-1])
In [94]:
pd.DataFrame(zip(new_date,ARIMA_model_death_forecast),columns=["Deaths","ARIMA Model Death Forecast"]).head()
Out[94]:
Deaths ARIMA Model Death Forecast
0 2020-09-24 995464.194459
1 2020-09-25 1001930.221154
2 2020-09-26 1008365.874338
3 2020-09-27 1014686.390085
4 2020-09-28 1020908.730652

COVID-19 doesn't have very high mortatlity rate as we can see which is the most positive take away. Also the healthy Recovery Rate implies the disease is cureable. The only matter of concern is the exponential growth rate of infection.